This notebook contains a simple PyTorch neural network that predicts the outcome of a COVID-19 infection (mild illness, hospitalised, died). The analysis is performed on fabricated data that I created.¶

In [1]:
# Switch: when True, the training frame and the frame to predict on are passed through scale_dataframe
scale_data = True

Import modules and data¶

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from math import ceil
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
In [4]:
# Read the labelled data set from the 'Data' sheet of the workbook
df = pd.read_excel('Covid May 2020 data.xlsx', sheet_name='Data')
# Display the last rows as a quick check of the columns and values that were loaded
df.tail()
Out[4]:
age bmi HbA1c_mmol_mol has_asthma has_high_blood_pressure result
10444 94 33.8 67 0 1 died
10445 90 23.3 100 0 1 hospitalised
10446 83 28.6 33 1 0 died
10447 31 18.2 80 0 0 mild illness
10448 26 26.9 40 1 0 mild illness

Prepare data¶

In [5]:
# Scale data
# Columns that are min-max scaled; the 0/1 flag columns are left untouched
SCALED_COLUMNS = ['age', 'bmi', 'HbA1c_mmol_mol']

# Shared scaler: fitted by the first call to scale_dataframe (the training frame
# below) and reused, without refitting, by every later call.
feature_scaler = MinMaxScaler()

def scale_dataframe(df, scaler=None):
    """Return a copy of ``df`` with the columns in SCALED_COLUMNS min-max scaled.

    The scaler is fitted only once. The first frame passed through it defines
    the per-column minimum and maximum; later frames are transformed with
    those same statistics. Refitting on every frame would map the same raw
    value to different scaled values in the training data and in the data to
    predict on, so the network would see inputs on a different scale from the
    one it was trained with.

    Args:
        df: DataFrame containing at least the columns in SCALED_COLUMNS.
            It is not modified.
        scaler: optional MinMaxScaler to use instead of the shared
            ``feature_scaler``. An unfitted scaler is fitted on ``df``;
            a fitted one is only applied.

    Returns:
        A new DataFrame with the scaled columns replaced by their scaled values.
    """
    if scaler is None:
        scaler = feature_scaler

    scaled_df = df.copy()  # never mutate the caller's frame
    raw_values = scaled_df[SCALED_COLUMNS]

    # MinMaxScaler only exposes data_min_ after fit(), so it marks a fitted scaler
    if hasattr(scaler, 'data_min_'):
        scaled_values = scaler.transform(raw_values)
    else:
        scaled_values = scaler.fit_transform(raw_values)

    for position, column in enumerate(SCALED_COLUMNS):
        scaled_df[column] = scaled_values[:, position]

    return scaled_df

if scale_data:
    df = scale_dataframe(df)

df.head()
Out[5]:
age bmi HbA1c_mmol_mol has_asthma has_high_blood_pressure result
0 0.243902 0.263158 0.472222 1 0 mild illness
1 0.829268 0.257895 0.347222 0 0 hospitalised
2 0.451220 0.600000 0.736111 0 1 hospitalised
3 0.719512 0.110526 0.444444 0 1 died
4 0.060976 0.963158 0.625000 0 0 mild illness
In [6]:
# Replace targets with ints
result_map = {'mild illness': 0, 'hospitalised': 1, 'died': 2}
result_map_reverse = {value: key for key, value in result_map.items()}

# Encode explicitly instead of relying on Series.replace to downcast the object
# column to integers (that implicit downcast is deprecated in recent pandas).
# Values that are not keys of result_map pass through unchanged, so re-running
# this cell on already-encoded labels is harmless, and astype raises if an
# unexpected label is left over instead of letting a string reach the tensors.
df['result'] = df['result'].map(lambda label: result_map.get(label, label)).astype('int64')
In [7]:
# Split the frame into predictors and target, then hand both over as numpy arrays
feature_frame = df.drop(columns=['result'])
target_series = df['result']

num_cols = feature_frame.shape[1]       # number of input features
num_outputs = target_series.nunique()   # number of distinct outcome classes

X = feature_frame.values
y = target_series.values

# Hidden layer width: mean of the input and output sizes, rounded up
h1_layers = ceil((num_cols + num_outputs) / 2)
In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X, y, test_size=0.2, random_state=41
)

# Features are stored as float32 tensors
X_train = torch.tensor(X_train_np, dtype=torch.float32)
X_test = torch.tensor(X_test_np, dtype=torch.float32)

# Labels are stored as int64 (long) tensors
y_train = torch.tensor(y_train_np, dtype=torch.long)
y_test = torch.tensor(y_test_np, dtype=torch.long)

Create model¶

In [9]:
class Model(nn.Module):
  """Fully connected classifier with one hidden layer.

  Args:
    in_features: size of each input sample.
    h1: number of units in the hidden layer.
    out_features: number of values produced per sample (one per class).
  """

  def __init__(self, in_features=num_cols, h1=h1_layers, out_features=num_outputs):
    super().__init__()
    # Attribute names are kept as they are: they become the keys of the state_dict
    self.fc1 = nn.Linear(in_features, h1)
    self.out = nn.Linear(h1, out_features)

  def forward(self, x):
    """Return the output-layer values for ``x``; no softmax is applied here."""
    hidden = F.relu(self.fc1(x))
    return self.out(hidden)

# Seed PyTorch's random number generator before the model is constructed
torch.manual_seed(49)
# Build the network with its default sizes (num_cols inputs, h1_layers hidden units, num_outputs outputs)
model = Model()

Train model¶

In [10]:
# Loss recorded at every epoch, as plain Python floats, for plotting afterwards
losses = []

epochs = 250
loss_function = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: every epoch is one optimisation step on the whole training set
for i in range(epochs):
  # Call the module rather than .forward() so that any registered hooks are run
  y_pred = model(X_train) # predicted results
  loss = loss_function(y_pred, y_train) # predicted values vs the y_train

  # .item() extracts the scalar as a float, detached from the autograd graph
  loss_value = loss.item()
  losses.append(loss_value)

  if i % 10 == 0:
    print(f'At epoch {i} loss was: {loss_value}')

  # Clear the previous step's gradients, backpropagate, then update the weights
  optimiser.zero_grad()
  loss.backward()
  optimiser.step()
At epoch 0 loss was: 1.1762192249298096
At epoch 10 loss was: 1.0961493253707886
At epoch 20 loss was: 1.0698022842407227
At epoch 30 loss was: 1.0202093124389648
At epoch 40 loss was: 0.9572418928146362
At epoch 50 loss was: 0.8754114508628845
At epoch 60 loss was: 0.7935022711753845
At epoch 70 loss was: 0.720582902431488
At epoch 80 loss was: 0.6599651575088501
At epoch 90 loss was: 0.6124850511550903
At epoch 100 loss was: 0.5760200023651123
At epoch 110 loss was: 0.5480552315711975
At epoch 120 loss was: 0.5266116857528687
At epoch 130 loss was: 0.510050356388092
At epoch 140 loss was: 0.49717631936073303
At epoch 150 loss was: 0.48710158467292786
At epoch 160 loss was: 0.47918781638145447
At epoch 170 loss was: 0.47297823429107666
At epoch 180 loss was: 0.4681414067745209
At epoch 190 loss was: 0.46439245343208313
At epoch 200 loss was: 0.46149691939353943
At epoch 210 loss was: 0.45926496386528015
At epoch 220 loss was: 0.45754602551460266
At epoch 230 loss was: 0.4562242329120636
At epoch 240 loss was: 0.4552067220211029

Graph epochs and losses¶

In [11]:
# Plot the training loss recorded at each epoch on explicit axes
fig, ax = plt.subplots()
ax.plot(range(epochs), losses)
ax.set_title('Training loss per epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel("Loss")
plt.show()  # render the figure without echoing the repr of the last call
Out[11]:
Text(0.5, 0, 'Epoch')

Evaluate model and find feature importance¶

In [12]:
# Determine feature importance
def permutation_feature_importance(model, X_test, y_test, column_names, n_iterations=10):
    """Estimate each feature's importance as the accuracy lost when it is shuffled.

    The baseline accuracy on the unmodified data is measured and printed first.
    Then, for every column of ``X_test``, that column alone is randomly permuted
    across rows ``n_iterations`` times and the accuracy is measured each time.

    Args:
        model: callable that maps a feature tensor to per-class scores.
        X_test: 2-D feature tensor (rows are samples, columns are features).
        y_test: tensor of true class labels, one per row of ``X_test``.
        column_names: accepted for the caller's convenience; not used here.
        n_iterations: number of random shuffles per feature.

    Returns:
        numpy array with one entry per feature: baseline accuracy minus the
        mean accuracy obtained with that feature shuffled.
    """
    baseline_accuracy = evaluate_accuracy(model, X_test, y_test)
    print(f'Accuracy (%): {baseline_accuracy*100:.2f}')

    n_rows = X_test.shape[0]
    n_features = X_test.shape[1]
    feature_importances = np.zeros(n_features)

    for feature_idx in range(n_features):
        shuffled_scores = []
        for _ in range(n_iterations):
            # Work on a fresh copy so only this one column is disturbed
            shuffled = X_test.detach().clone()
            row_order = torch.randperm(n_rows)
            shuffled[:, feature_idx] = shuffled[:, feature_idx][row_order]
            shuffled_scores.append(evaluate_accuracy(model, shuffled, y_test))
        feature_importances[feature_idx] = baseline_accuracy - np.mean(shuffled_scores)

    return feature_importances

# Evaluate model (on test dataset)
def evaluate_accuracy(model, X_test, y_test):
    """Return the fraction of rows in ``X_test`` whose predicted class equals ``y_test``.

    Args:
        model: callable that maps a feature tensor to per-class scores.
        X_test: feature tensor with one row per sample.
        y_test: tensor of true class labels, one per row.

    Returns:
        Accuracy as a float between 0 and 1.
    """
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        class_scores = model(X_test)

    # The predicted class is the index of the largest score in each row
    predicted_labels = torch.argmax(class_scores, dim=1)
    n_correct = (predicted_labels == y_test).sum().item()

    return n_correct / len(y_test)

# Label the scores with the feature columns only. The target column is not a
# model input, so it is excluded explicitly rather than relying on it being the
# last column of the frame.
feature_names = [column for column in df.columns if column != 'result']

feature_importances = permutation_feature_importance(model, X_test, y_test, feature_names)
assert len(feature_names) == len(feature_importances), "feature names and scores are misaligned"

# Print each score and keep it by name for the bar chart
importance_dict = {}
for column, importance in zip(feature_names, feature_importances):
    print(f"'{column}' importance score: {importance}")
    importance_dict[column] = importance
Accuracy (%): 79.47
'age' importance score: 0.4310526315789474
'bmi' importance score: 0.13339712918660296
'HbA1c_mmol_mol' importance score: 0.002105263157894721
'has_asthma' importance score: 0.0011004784688994906
'has_high_blood_pressure' importance score: 0.0014354066985645675
In [13]:
# Bar chart with one bar per entry of importance_dict
feature_labels = list(importance_dict)
importance_values = list(importance_dict.values())

fig, ax = plt.subplots()
ax.bar(feature_labels, importance_values)
ax.set_ylabel("Importance Score")
plt.xticks(rotation=90)  # draw the x tick labels vertically
plt.show()

Make prediction on new data¶

In [14]:
# Read the data to predict on from the 'Data' sheet of the second workbook
df_new = pd.read_excel('Data to predict on.xlsx', sheet_name='Data')
# Display the last rows as a quick check of what was loaded
df_new.tail()
Out[14]:
age bmi HbA1c_mmol_mol has_asthma has_high_blood_pressure
1125 23 34.5 36 0 1
1126 46 28.8 94 0 0
1127 72 28.7 67 1 1
1128 91 19.8 36 0 1
1129 69 16.7 81 0 0
In [15]:
# When scaling is enabled, scale a copy so that df_new itself keeps its raw values;
# otherwise the model is fed df_new directly.
df_new_for_model = scale_dataframe(df_new.copy()) if scale_data else df_new
In [16]:
# Model inputs, in the same order as the feature columns of the training frame
feature_columns = ['age', 'bmi', 'HbA1c_mmol_mol', 'has_asthma', 'has_high_blood_pressure']

# Score every row in one batched forward pass instead of one model call per row
new_people = torch.tensor(df_new_for_model[feature_columns].values, dtype=torch.float32)

with torch.no_grad():
    class_scores = model(new_people)

# The predicted class is the index of the largest score in each row; map it back to its label
predicted_indices = class_scores.argmax(dim=1).tolist()
predictions = [result_map_reverse[class_index] for class_index in predicted_indices]

# Attach the labels to the unscaled frame so they sit next to the raw values
df_new['prediction'] = predictions

df_new.head()
Out[16]:
age bmi HbA1c_mmol_mol has_asthma has_high_blood_pressure prediction
0 30 32.6 51 1 0 mild illness
1 40 20.5 89 0 0 mild illness
2 94 21.0 34 1 0 died
3 79 21.7 36 0 1 hospitalised
4 33 32.5 40 0 1 mild illness

Save model locally¶

In [17]:
# Write the model's state_dict (its parameters, keyed by layer attribute name) to a local file
torch.save(model.state_dict(), 'Covid Multi-Class.pt')